#| label: define-cd-sales-panel
#| include: false
# Build CD-Level Sales Panel from DOF + PLUTO
build_cd_sales_panel <- function(
pre_years = c(2017, 2018, 2019),
post_years = c(2021, 2022, 2023),
boroughs = c("bronx", "brooklyn", "manhattan", "queens", "staten_island"),
pluto_xwalk = NULL
) {
# 0. Dependencies / inputs
if (is.null(pluto_xwalk)) {
pluto_xwalk <- get_pluto_cd_crosswalk()
}
# 1. Year/borough grid
combos <- expand_grid(
year = c(pre_years, post_years),
borough = boroughs
) |>
arrange(year, borough)
message("=== Building CD sales panel ===")
message("Pre-COVID years: ", paste(pre_years, collapse = ", "))
message("Post-COVID years: ", paste(post_years, collapse = ", "))
sales_list <- vector("list", nrow(combos))
match_summary_vec <- vector("list", nrow(combos))
# 2. Loop over year × borough combos
for (i in seq_len(nrow(combos))) {
yr <- combos$year[i]
bo <- combos$borough[i]
message(" [", i, "/", nrow(combos), "] Processing year ", yr, " – ", bo)
sales <- get_dof_sales_year_boro(yr, bo)
if (is.null(sales) || nrow(sales) == 0L) {
warning(" No usable rows for ", yr, " ", bo)
match_summary_vec[[i]] <- tibble(
year = yr,
borough = bo,
n_total = 0L,
n_matched = 0L,
match_rate = NA_real_
)
next
}
# 2a. Match stats vs. PLUTO crosswalk (per-file diagnostic)
n_total <- nrow(sales)
matched <- inner_join(sales, pluto_xwalk, by = "bbl")
n_matched <- nrow(matched)
match_rate <- if (n_total > 0L) n_matched / n_total else NA_real_
message(
sprintf(
" n_total = %s, n_matched = %s (%.1f%%)",
comma(n_total),
comma(n_matched),
100 * match_rate
)
)
sales_list[[i]] <- sales |>
mutate(
year = yr,
borough = bo
)
match_summary_vec[[i]] <- tibble(
year = yr,
borough = bo,
n_total = n_total,
n_matched = n_matched,
match_rate = match_rate
)
}
# 3. Bind all years/boroughs together
all_sales <- bind_rows(sales_list)
match_summary_all <- bind_rows(match_summary_vec)
message("")
message("=== Per-File Match Summary ===")
# print(match_summary_all |> arrange(year, borough), n = Inf)
assign("sales_match_summary", match_summary_all, envir = .GlobalEnv)
# 4. Collapse to CD–year panel with Enhanced Matching
# 4a. First attempt: Exact BBL match
# This captures ~75% of transactions with standard tax lot BBLs
sales_matched_exact <- all_sales |>
inner_join(pluto_xwalk, by = "bbl")
n_exact <- nrow(sales_matched_exact)
# 4b. Second attempt: Block-level matching for unmatched sales
# Unmatched sales are typically condos with billing BBLs that don't appear
# in PLUTO's standard BBL crosswalk. We recover these by matching to their
# physical city block, then assigning the most common CD for that block.
# This fallback approach recovers ~25% of transactions that would otherwise be lost.
sales_unmatched <- all_sales |>
anti_join(pluto_xwalk, by = "bbl")
if (nrow(sales_unmatched) > 0) {
message("")
message("=== Attempting Block-Level Matching ===")
message("Unmatched sales: ", comma(nrow(sales_unmatched)))
# Create block-level lookup: for each block, assign the most common CD
# Handles edge cases where blocks span multiple CDs by taking majority vote
pluto_block_lookup <- pluto_xwalk |>
mutate(
block = as.integer(substr(bbl, 2, 6)), # Extract block (positions 2-6)
boro_digit = substr(bbl, 1, 1) # Extract borough (position 1)
) |>
count(boro_digit, block, cd_id, boro_cd) |>
group_by(boro_digit, block) |>
slice_max(n, n = 1, with_ties = FALSE) |> # Take most frequent CD per block
ungroup() |>
select(boro_digit, block, cd_id, boro_cd)
# Match unmatched sales by block
sales_matched_block <- sales_unmatched |>
mutate(boro_digit = substr(bbl, 1, 1)) |>
inner_join(
pluto_block_lookup,
by = c("boro_digit", "block")
) |>
select(-boro_digit)
n_block <- nrow(sales_matched_block)
message("Block-matched: ", comma(n_block))
message("Still unmatched: ", comma(nrow(sales_unmatched) - n_block))
# Combine exact and block matches
sales_with_cd <- bind_rows(
sales_matched_exact,
sales_matched_block
)
} else {
sales_with_cd <- sales_matched_exact
n_block <- 0
}
# 4c. Aggregate to CD-year level
# Create period labels and collapse transaction-level data to CD-year medians
cd_panel <- sales_with_cd |>
mutate(
period = case_when(
year %in% pre_years ~ "pre_covid",
year %in% post_years ~ "post_covid",
TRUE ~ NA_character_
)
) |>
filter(!is.na(period)) |>
group_by(cd_id, boro_cd, year, period) |>
summarise(
n_sales = n(),
median_price = median(sale_price, na.rm = TRUE),
mean_price = mean(sale_price, na.rm = TRUE),
sd_price = sd(sale_price, na.rm = TRUE),
total_sales_vol = sum(sale_price, na.rm = TRUE),
median_gsf = median(gross_square_feet, na.rm = TRUE),
n_gsf_nonmissing = sum(!is.na(gross_square_feet)),
.groups = "drop"
) |>
arrange(year, cd_id)
# 5. Overall match statistics
total_sales_all_files <- nrow(all_sales)
total_sales_matched <- n_exact + n_block
overall_match_rate <- total_sales_matched / total_sales_all_files
message("")
message("=== Overall Match Statistics ===")
message("Total sales (all files): ", comma(total_sales_all_files))
message("Exact BBL matches: ", comma(n_exact))
message("Block-level matches: ", comma(n_block))
message("Sales matched to CDs: ", comma(total_sales_matched))
message("Sales NOT matched: ",
comma(total_sales_all_files - total_sales_matched))
message("Overall match rate: ",
round(100 * overall_match_rate, 1), "%")
if (overall_match_rate < 0.75) {
warning("Overall match rate is below 75% - review BBL construction logic")
}
# 6. Validate CD coverage
cd_sf <- get_nyc_cd()
expected_cds <- unique(cd_sf$cd_id)
actual_cds <- unique(cd_panel$cd_id)
missing_cds <- setdiff(expected_cds, actual_cds)
if (length(missing_cds) > 0) {
warning("Missing CDs in final panel: ",
paste(missing_cds, collapse = ", "))
message(" These CDs may have no matching sales in the selected years")
}
# 7. Flag low-volume CD-years
low_volume <- cd_panel |>
filter(n_sales < 50)
if (nrow(low_volume) > 0) {
message("")
message("Warning: ", nrow(low_volume),
" CD-year combinations have fewer than 50 sales:")
print(
low_volume |>
select(cd_id, year, period, n_sales) |>
arrange(n_sales)
)
}
# 8. Final summary
message("")
message("=== Final CD Sales Panel ===")
message("CD sales panel built with ", nrow(cd_panel), " rows.")
message("Distinct CDs: ", n_distinct(cd_panel$cd_id))
message("Years: ", paste(sort(unique(cd_panel$year)), collapse = ", "))
message("Periods: ", paste(sort(unique(cd_panel$period)), collapse = ", "))
cd_panel
}